set.seed(1)
required_packages <- c("tidyverse", "magrittr", "DBI", "bigrquery", "arrow","glue", "vroom","janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext", "pander")
for(i in required_packages) { 
  if(!require(i, character.only = T)) {
    #  if package is not existing, install then load the package
    install.packages(i, dependencies = T)
  require(i, character.only = T)
  }
}
panderOptions('table.alignment.default', "left")
## quality of png's
dpi <- 750
## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             plot.title = element_text(size = 20,
                                       face = "bold",
                                       lineheight = 1.05,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             plot.title.position = "plot",
             plot.caption = element_text(color = "grey40",
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(family = "Montserrat",
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(family = "Montserrat",
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))
## theme settings for flipped plots
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))
## theme settings for maps
theme_map <- 
  theme_void(base_family = "Montserrat") +
  theme(legend.direction = "horizontal",
        legend.box = "horizontal",
        legend.margin = margin(10, 10, 10, 10),
        legend.title = element_text(size = 17, 
                                    face = "bold"),
        legend.text = element_text(color = "grey33",
                                   size = 12),
        plot.margin = margin(15, 5, 15, 5),
        plot.title = element_text(face = "bold",
                                  size = 20,
                                  hjust = .5,
                                  margin = margin(30, 0, 10, 0)),
        plot.subtitle = element_text(face = "bold",
                                     color = "grey33",
                                     size = 17,
                                     hjust = .5,
                                     margin = margin(10, 0, -30, 0)),
        plot.caption = element_text(size = 14,
                                    color = "grey33",
                                    hjust = .97,
                                    margin = margin(-30, 0, 0, 0)))
## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")
## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")
## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))
## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)
display_number <- function(n, title){
  tibble(!!sym(title) := format(n, big.mark = ",")) %>% pander()
}
con <- dbConnect(
    bigrquery::bigquery(),
    project = "dataforseo-bigquery",
    billing = "dataforseo-bigquery"
)
sql <- glue("SELECT * FROM `dataforseo-bigquery.dataforseo_data.keyword_data` 
          WHERE location = 2840 
          AND keyword_info_search_volume > 0
          ORDER BY keyword_info_search_volume DESC
          LIMIT 50000")
tb <- bq_project_query("dataforseo-bigquery", sql)
top <- bq_table_download(tb, max_results = 50000)

!!!J: In this version I remove all the entries with zero volume for all stats. I think if we prefer to go by count rather than by volume, this is the best approach.

Basic stats

SELECT COUNT(keyword_info_search_volume) as total_count
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume > 0
total_count <- sql$total_count
tibble("Total number of searches" = glue("~{format(round(total_count / 1000000))} million")) %>% 
  pander()
Total number of searches
~306 million
SELECT SUM(COALESCE(keyword_info_search_volume / 10000, 0)) AS total_volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
# Calculated in a roundabout way to avoid integer overflow
total_volume <- sql$total_volume * 10000
tibble("Total volume of searches" = glue("~{format(round(total_volume / 1000000000))} billion")) %>% 
  pander()
Total volume of searches
~303 billion


This table shows the top 10 searches. They are all spelling errors. As in, they are not really searched, but rather people attempting to go to Youtube or Facebook, but typing it wrong. Oddly they are all attributed as having a search volume of exactly 185 million.

top %>% 
  select(keyword, location, spell, spell_type, keyword_info_search_volume) %>% 
  head(10) %>% 
  gt() %>% 
  tab_options(table.align = "left") %>% 
  tab_header("Top searches")
Top searches
keyword location spell spell_type keyword_info_search_volume
htps :/ youtube com 2840 youtube did_you_mean 185000000
yout be 2840 youtube showing_results_for 185000000
the facebook 2840 185000000
faubook 2840 185000000
ytuo 2840 youtube did_you_mean 185000000
youtubr 2840 youtube showing_results_for 185000000
xoyoutube 2840 youtube showing_results_for 185000000
utube 2840 youtube did_you_mean 185000000
youtube cooom 2840 com showing_results_for 185000000
youbra 2840 youtube did_you_mean 185000000
SELECT COUNT(*) as missing_count
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume IS NULL
missing_count <- sql$missing_count
tibble("Missing search volume" = scales::percent(missing_count / total_count, accuracy = 0.001)) %>% pander()
Missing search volume
0.975%

The missing have some searches that are likely high volume. Thus they are truly missing, and not just 0s.

SELECT keyword, keyword_info_search_volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume IS NULL
ORDER BY RAND()
LIMIT 10
sql %>% gt() %>% tab_options(table.align = "left") %>% 
  tab_header("Keywords with missing search volume")
Keywords with missing search volume
keyword keyword_info_search_volume
GoPro Hero 4 Battery Charger NA
किसान क्रेडिट कार्ड हेल्पलाइन नंबर Bihar NA
Chatwork サービス NA
Alice Munro Selected Stories NA
Mesa College Baseball NA
Brother Fax-575 NA
My Heart Beets butter Chicken NA
Albertsons Ad NA
Kaju Katli Recipe with milk NA
Bible Proverbs about life NA
display_number(total_volume / total_count, "Mean search volume")
Mean search volume
988.6631
SELECT approx_quantiles(keyword_info_search_volume, 2)[offset(1)] AS median
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0
display_number(sql$median, "Median search volume")
Median search volume
10
SELECT AVG(`keyword_info_cpc`) AS mean_cpc
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume > 0
display_number(sql$mean_cpc, "Mean CPC")
Mean CPC
0.6069549
SELECT approx_quantiles(keyword_info_cpc, 2)[offset(1)] AS median_cpc
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume > 0
display_number(sql$median_cpc, "Median CPC")
Median CPC
0


Spell types

SELECT spell_type, SUM(keyword_info_search_volume) / 10000 AS `volume`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
GROUP BY spell_type
spell_types <- sql %>% mutate(spell_type = ifelse(spell_type == "", "no_spell_type", spell_type)) %>% 
  mutate(volume = volume / sum(volume)) 
spell_types %>% ggplot(aes(x = reorder(spell_type, volume), y = volume)) +
  geom_bar(stat = "identity", width = 0.8, fill = "turquoise4", color = "black") +
  labs(x = "", y = "", title = "Spell types - by volume") +
  scale_y_continuous(labels = scales::percent)


About half of search volume has a spell type. This is especially driven by misspellings of common domains.

If going by count instead of by volume, almost none of the searches have a spell type:

SELECT spell_type, COUNT(spell_type) as n
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
ANd keyword_info_search_volume > 0
GROUP BY spell_type
prop <- 1 - sql %>% filter(spell_type == "") %>% pull(n) / total_count
tibble("Proportion with spell type" = scales::percent(prop, accuracy = 0.001)) %>% pander()
Proportion with spell type
1.389%
spell_types <- sql %>% filter(spell_type != "")
spell_types %>% mutate(prop = n / sum(n)) %>% 
  ggplot(aes(x = reorder(spell_type, prop), y = prop)) +
  geom_bar(stat = "identity", width = 0.8, fill = "turquoise4", color = "black") +
  labs(x = "", y = "", title = "Spell types - by count") +
  scale_y_continuous(labels = scales::percent)


top %>% group_by(spell) %>% 
  summarise(volume = sum(keyword_info_search_volume)) %>% 
  arrange(desc(volume)) %>% 
  filter(spell != "") %>% 
  mutate(volume = scales::percent(volume / sum(volume), accuracy = 0.1)) %>% 
  head(10) %>% 
  gt() %>% 
  tab_options(table.align = "left") %>% 
  tab_header("Top 10 intended searches that are misspelled")
Top 10 intended searches that are misspelled
spell volume
youtube 35.3%
facebook 8.7%
amazon 7.6%
google 6.3%
weather 2.2%
translate 1.6%
com 1.5%
instagram 1.3%
walmart 1.3%
ebay 1.2%



Questions

question_words <- c("what", "which", "where", "who", "why", "how")
write_questions_volume <- function(){
  questions <- tribble(~question, ~volume)
  for (word in question_words){
    sql <- glue("SELECT sum(keyword_info_search_volume) FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
              WHERE location = 2840 
              AND keyword like '{word} %' ")
    tb <- bq_project_query("dataforseo-bigquery", sql)
    df <- bq_table_download(tb) 
    questions %<>% add_row(question = word, volume = df$f0_)
  }
  write_csv(questions, "../proc_data/questions_volume.csv")
}
#write_questions_volume()
questions <- read_csv("../proc_data/questions_volume.csv")

questions %>% mutate(prop = volume / total_volume) %>% 
  ggplot(aes(x = reorder(question, prop), y = prop)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.005), expand = c(0,0)) +
  labs(title = "Questions in searches - by volume", x = "", y = "")

write_questions_count <- function(){
  questions <- tribble(~question, ~n)
  for (word in question_words){
    sql <- glue("SELECT COUNT(keyword_info_search_volume) as n
              FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
              WHERE location = 2840
              AND keyword_info_search_volume > 0
              AND keyword like '{word} %' ")
    tb <- bq_project_query("dataforseo-bigquery", sql)
    df <- bq_table_download(tb) 
    questions %<>% add_row(question = word, n = df$n)
  }  
  write_csv(questions, "../proc_data/questions_count.csv")
}
write_questions_count()
questions <- read_csv("../proc_data/questions_count.csv")
questions %>% mutate(prop = n / total_count) %>% 
  ggplot(aes(x = reorder(question, prop), y = prop)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.025), expand = c(0,0)) +
  labs(title = "Questions in searches - by count", x = "", y = "")

tibble("Total percentage of searches that are questions" = 
         scales::percent(questions %$% sum(n) / total_count, accuracy = 0.001)) %>% 
  pander()
Total percentage of searches that are questions
3.632%


Stopwords

stopword_list <- tibble(stopword = stopwords::stopwords(language = "en")) %>% 
  mutate(stopword = str_remove(stopword, "'")) %>% 
  filter(!(stopword %in% c("shed", "wed", "ill", "hell", "shell")))
get_stopwords_counts <- function(){
  stopwords <- tribble(~stopword, ~n)
    for (word in stopword_list$stopword){
      print(word)
      sql <- glue(
        "SELECT COUNT(keyword_info_search_volume) AS stopword_count
         FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
         WHERE location = 2840 
         AND keyword_info_search_volume > 0
         AND keyword like '% {word} %' OR keyword like '{word} %' OR keyword like '% {word}'")
      tb <- bq_project_query("dataforseo-bigquery", sql)
    
      df <- bq_table_download(tb) 
      stopwords %<>% add_row(stopword = word, n = df$stopword_count)
    }
    
    write_csv(stopwords, "../proc_data/stopwords.csv")
}
#get_stopwords_counts()
stopwords <- read_csv("../proc_data/stopwords.csv")
stopwords %>% mutate(prop = n / total_count) %>%
  arrange(desc(prop)) %>% 
  head(10) %>% 
  ggplot(aes(x = reorder(stopword, prop), y = prop)) +
  geom_bar(stat = "identity", color = "black", fill = "turquoise4", width = 0.7) +
  scale_y_continuous(labels = scales::percent, expand = c(0,0), limits = c(0, 0.085)) +
  labs(x = "", y = "", title = "Searches with specific stopwords") +
  coord_flip()

!!!D: the graph is not that insightful. Please make the wordcloud a bit more readable and visually appealing. Feel to add more than 25 words.

stopwords %>% 
  mutate(stopword = ifelse(stopword == "i", "I", stopword)) %>% 
  arrange(desc(n)) %>% 
  head(25) %>% 
  ggplot(aes(label = stopword, size = n, color = factor(sample.int(10, 25, replace = TRUE)))) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 50) +
  theme_minimal()


Search tails

volume_top <- top %>%  
  add_rownames() %>% 
  mutate(rowname = as.numeric(rowname)) %>% 
  select(rowname, volume = keyword_info_search_volume)
ylab <- c(50, 100, 150, 200)
volume_top %>% 
  filter(rowname < 10000) %>% 
  mutate(cat = case_when(
    rowname < 500 ~ "Top 500",
    rowname < 2000 ~ "Top 2000",
    rowname < 10000 ~ "Top 10000"
  )) %>% 
  mutate(cat = factor(cat, levels = c("Top 500", "Top 2000", "Top 10000"))) %>% 
  head(10500) %>% 
  ggplot(aes(x = rowname, y = volume, fill = cat)) +
  geom_area(alpha = 0.8) +
  scale_y_continuous(
    labels = glue("{ylab} M"),
    breaks = 10^6 * ylab,
    limits = c(0, 200* 10^6), 
    expand = c(0,0)
    ) +
  labs(x = "", title = "Long tail", fill = "") + 
  annotate("text", x = 9800, y = 75*10^6, label = "Remaining 99.9967%") + 
  geom_segment(aes(x = 8900, y = 35*10^6, xend = 11500, yend = 35*10^6),
               arrow = arrow(length = unit(0.5, "cm"))) +
  scale_x_continuous(expand = c(0,0), limits = c(0, 11500)) +
  ggeasy::easy_remove_axes(which = "x", what = "text")

!!!J: Better title


get_count_range <-  function(lower, higher)
{
  sql <- glue(
        "SELECT COUNT(*) AS `count` 
         FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
         WHERE location = 2840 
         AND keyword_info_search_volume >= {lower} 
         AND keyword_info_search_volume <= {higher}")
      tb <- bq_project_query("dataforseo-bigquery", sql)
      bq_table_download(tb)$count
}
df <- tribble(
  ~cat, ~count,
  "0 - 10", get_count_range(0, 10),
  "11- 100", get_count_range(11, 100),
  "101 - 1000", get_count_range(101, 1000),
  "1001 - 10000", get_count_range(1001, 10000),
  "10001 - 100000", get_count_range(10001, "100000"),
  "100001+", get_count_range("100001", "100000000000")) 
df %>% 
  mutate(count = count / sum(count)) %>% 
  ggplot(aes(x = reorder(cat, desc(count)), y = count)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
  theme_light() +
  labs(x = "Search phrase length", y = "Percentage of all searches", title = "Long tail") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1), expand = c(0,0))

!!!J: Better title


Keyword length

write_length_volume <- function()
{
  get_length_volume <-  function(l)
  {
    sql <- glue(
          "SELECT sum(keyword_info_search_volume) / 10000 as `volume`, count(keyword_info_search_volume) as `count`
           FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
           WHERE location = 2840
           AND keyword_info_search_volume IS NOT NULL
           AND LENGTH(keyword) = {l}")
        tb <- bq_project_query("dataforseo-bigquery", sql)
        bq_table_download(tb) %>% mutate(length = l, volume = volume * 10000)
  }
  
  df <- map_df(1:50, get_length_volume)  
  write_csv(df, "../proc_data/keyword_length_volume.csv")
}
df <- read_csv("../proc_data/keyword_length_volume.csv")
df %>% mutate(prop = volume / sum(volume)) %>% 
  ggplot(aes(x = length, y = prop)) +
  geom_line(color = "turquoise4", size = 1) + geom_point(color = "turquoise4") +
  theme_light() +
  labs(x = "Keyword length", y = "Total search volume", title = "Search volume by keyword length") +
  scale_y_continuous(labels = scales::percent)

The most searched queries have length 6-9 characters, and falls continuously for search queries longer or shorter than that.


Keyword_info categories

pservices <- read_csv("../raw_data/productsservices.csv") %>% 
  clean_names() %>% rename(c1 = criterion_id) %>% select(-category) %>% 
  separate(c1, sep =",\"", into = c("id", "category")) %>% 
  mutate(category = substr(category, 2, nchar(category) -1)) %>% 
  separate(category, sep = "/", into = c("cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8"))
toplevel <- pservices %>% filter(is.na(cat2))
write_categories <- function()
{
  get_category_volume <-  function(id){
    sql <- glue(
      "SELECT SUM(keyword_info_search_volume) / 10000 AS `search_volume`, AVG(keyword_info_cpc) AS `cpc`, COUNT(*) AS `count`
       FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
       WHERE location = 2840 
       AND keyword_info_categories like '%{id}%' ")
    tb <- bq_project_query("dataforseo-bigquery", sql)
    bq_table_download(tb) %>% mutate(id = id)
  }
  df <- map_df(toplevel$id, get_category_volume)  
  
  df %>% mutate(search_volume = search_volume * 10000,
                mean_volume = search_volume / count
                ) %>% 
    left_join(toplevel %>% select(id, cat1), by = "id") %>% 
    write_csv("../proc_data/categories_averages.csv")
}
df <- read_csv("../proc_data/categories_averages.csv")
df %>% 
  ggplot(aes(x = fct_rev(cat1), y = mean_volume)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  theme_light() +
  coord_flip() +
  scale_y_continuous(limits = c(0, 9000), expand = c(0,0)) +
  labs(x = "", y = "", title = "Search volume mean by category")

!!!D: I would be curious to see the same graph with median. Wouldn´t it be better to use median given the skewed data set?

!!!J: I am skeptical for using the median, since it is brought far down by a large number of searches with low volume, even when we exclude 0 volume searches. Remember, the median cpc overall was 0. But you are right that it is an issue with the skewed data set.

Median:

write_categories_medians <- function()
{
  get_category_median <-  function(id){
    sql <- glue(
      "SELECT approx_quantiles(keyword_info_search_volume, 2)[offset(1)] AS median_volume, approx_quantiles(keyword_info_cpc, 2)[offset(1)] AS median_cpc
       FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
       WHERE location = 2840 
       AND keyword_info_search_volume > 0
       AND keyword_info_categories like '%{id}%' ")
    tb <- bq_project_query("dataforseo-bigquery", sql)
    bq_table_download(tb) %>% mutate(id = id)
  }
  df <- map_df(toplevel$id, get_category_median)  
  
  df %>% left_join(toplevel %>% select(id, cat1), by = "id") %>% 
    write_csv("../proc_data/categories_medians.csv")
}
#write_categories_medians()
df <- read_csv("../proc_data/categories_medians.csv")


df %>% 
  ggplot(aes(x = fct_rev(cat1), y = median_cpc)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  theme_light() +
  coord_flip() +
  scale_y_continuous(limits = c(0, 2), expand = c(0,0)) +
  labs(x = "", y = "", title = "CPC median by category")